In this notebook, we will see different regression methods and how well they predict on our dataset, together with methods like:
The dataset used is from Kaggle, named PetFinder.my Adoption Prediction.
Contestants are required to predict this value. The value is determined by how quickly, if at all, a pet is adopted. The values are determined in the following way:
Here we are defining the libraries and some functions that we will use later on our data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import seaborn as sns
import plotly
import plotly.offline
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import BayesianRidge, RidgeCV, LassoCV, LinearRegression, HuberRegressor, ElasticNetCV, Lasso, Ridge, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from scipy import stats
from scipy.stats import skew
from sklearn.preprocessing import RobustScaler
sns.set()
## Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve
This part of the analysis provides preprocessing (cleaning the noisy data, seeing if there are any outliers, encoding the attributes if it is needed) and a better understanding of the dataset through visualizations. This is very important because it would help us choose the right technique for data mining and the right estimator for the accuracy of the algorithms.
We can see that the dataset has 14994 rows, of which we will test 2998 rows, which amounts to an 80-20 split; we will do this later in the next step with sklearn train_test_split.
We also see that there are many null values in the Names attribute, and that is the reason why we are going to add a new class named Undefined for each of those with missing names. After cleaning the dataset, we can see that we still have one column with null values: Speed. We are going to drop the Speed column in the process of cleaning data, since we already have a column Adoption speed.
After cleaning the train and test dataset, we will drop the Description column as irrelevant for this prediction because we are not going to use natural language processing algorithms.
#importing the dataset
df_train = pd.read_csv("data/train.csv")
print(df_train.info())
# Independent variables: drop the target together with the identifier /
# free-text columns that are irrelevant for prediction.
# BUG FIX: the original first dropped only AdoptionSpeed and then
# overwrote X with a drop that kept AdoptionSpeed, so the target leaked
# into the feature matrix; both drops are combined here.
X = df_train.drop(columns=['Name', 'RescuerID', 'Description', 'PetID', 'State', 'AdoptionSpeed'])
# Target variable
y = df_train["AdoptionSpeed"]
# Independent variables (no target column)
print(X.head())
print(y.head())
np.random.seed(42)
# ~80/20 split of the raw frame for the exploratory steps below
train_df, test_df = train_test_split(df_train, train_size=11900, test_size=2998)
test_df.info()
#Get first five rows from dataset
df_train.head()
#Get all columns names
print(df_train.columns)
In the column Name and Description, there are some missing values that we are going to fill with a new class Undef
# Inspect missing data: the cell-wise mask is hard to read on its own,
# so we also print an any-null flag and the per-column null counts.
print(df_train.isnull())
# Is there at least one null value anywhere in the frame?
print(df_train.isnull().values.any())
df_train.isnull().sum()
# Example: the Name entry is NaN around row 36.
df_train[30:40]
# Shape as (rows, columns)
df_train.shape
# Shape after dropping every row with a null value
df_train.dropna().shape
We can conclude that the missing values are a name and a description and that it is better to fill them in than to delete them. So we made a new class for that purpose which is called 'Undef' and we use it to fill in the place where the value is missing (is not defined).
# Replace missing Name/Description entries with the placeholder class.
undef_fill = {'Name': 'Undef', 'Description': 'Undef'}
df_train = df_train.fillna(undef_fill)
df_train[30:40]
df_train.info()
test_df['Name'] = test_df['Name'].fillna('Undef')
test_df
test_df = test_df.fillna(undef_fill)
# Both datasets follow the same format, so the next summary statistics
# are shown for the training set only.
df_train.describe()
features = df_train.columns.values[:]
number_of_features = len(features)
print("Number of features: ", number_of_features, features)
Even though Black and Brown are the most common colors for pets in the dataset, there are more than 2000 black colored pets who are unadopted.
Let's also analyse AdoptionSpeed across various combinations of animal color. (ColorName1,ColorName2 and ColorName3)
color_labels = pd.read_csv("data/color_labels.csv")
new_dataFrame = pd.read_csv("data/train.csv")
new_dataFrame.columns
color_labels.columns
color_labels
new_dataFrame
# Attach the human-readable color name for each of the three color slots;
# the lookup table is renamed before each merge so the join key matches.
color_labels.rename(columns={'ColorID':'Color1','ColorName':'ColorName1'},inplace = True)
new_dataFrame=new_dataFrame.merge(color_labels,on='Color1',how='left')
new_dataFrame.columns
color_labels.rename(columns={'Color1':'Color2','ColorName1':'ColorName2'},inplace = True)
new_dataFrame=new_dataFrame.merge(color_labels,on='Color2',how='left')
new_dataFrame.columns
color_labels.rename(columns={'Color2':'Color3','ColorName2':'ColorName3'},inplace = True)
new_dataFrame=new_dataFrame.merge(color_labels,on='Color3',how='left')
new_dataFrame = new_dataFrame.drop(["Color1",'Color2','Color3'],axis = 1)
new_dataFrame.columns
# BUG FIX: pets with fewer than three colors get NaN in ColorName2/3 after
# the left merges, and string concatenation with NaN yields NaN — which
# silently excluded every such pet from the color counts. Fill with ''.
new_dataFrame["All_Colors"] = (new_dataFrame["ColorName1"].fillna('')
                               + new_dataFrame["ColorName2"].fillna('')
                               + new_dataFrame["ColorName3"].fillna(''))
new_dataFrame["All_Colors"].unique()
#Top 5 color combinations
top_5_colors = new_dataFrame["All_Colors"].value_counts()[:5]
top_5_color = dict(top_5_colors)
top_5_color
# Ten most common adoption fees and their total count
sum_fee = df_train["Fee"].value_counts()[:10]
sum_fee
print(sum_fee.sum())
As we can see, more than 14,000 pets in the data are bought in under 300 Singapore Dollars. Most of the Animals (12663) in the listing who are adopted are free of cost. Let's analyse the Adoption Speed Based on the Fee charged and Type for the Animal
# Fee vs. adoption speed, colored by animal type.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df_train, x='AdoptionSpeed', y='Fee', hue='Type', palette="summer")
From the plot, it is pretty clear that most pets within fee range below 1000 dollars are adopted more. There is only 1 dog costing 3000 $ who also is adopted within 2-3 months. This basically means that generally people adopt pets who are mid-range expensive
# Distribution of the target column before the log transform.
fig, ax = plt.subplots(figsize=(8, 6))
plt.title('Before log(1+x)')
sns.distplot(df_train['AdoptionSpeed'], fit=stats.norm)
plt.show()
# Interactive box plot of the target (mean/sd and all points shown).
box = go.Box(y=df_train["AdoptionSpeed"], name="AdoptionSpeed", boxmean='sd', boxpoints='all')
plotly.offline.iplot(go.Figure([box]), filename="Adoption Speed Box Plot")
With this matrix we want to see how correlated the independent columns are with the dependent AdoptionSpeed. As a metric we are choosing the Spearman Correlation Metric since it checks monotonicity, which lets us see if there is any relation besides a linear one, and it works well with ordinal and continuous variables together.
# Spearman rank correlation between all numeric columns.
# df_train still contains free-text columns (Name, Description, ...) at this
# point, which cannot be correlated; restricting to numeric dtypes gives the
# same matrix and avoids a TypeError on recent pandas versions.
corrmat = df_train.select_dtypes(include=np.number).corr(method='spearman')
layout = go.Layout(width=1000,
                   height=1000,
                   autosize=False)
obj = go.Heatmap(z=corrmat.values, x=corrmat.columns.values, y=corrmat.columns.values)
fig = go.Figure([obj], layout)
plotly.offline.iplot(fig, filename="Correlation matrix")
For our plot, we will get all the features that have absolute value above 0.5 with the dependent AdoptionSpeed.
# Scatterplot-matrix (SPLOM) of all numeric features; only the lower
# triangle is drawn and the diagonal is hidden.
obj = go.Splom(dimensions=[dict(label='Type', values=df_train['Type']),
                           dict(label='Age', values=df_train['Age']),
                           dict(label='Breed1', values=df_train['Breed1']),
                           dict(label='Breed2', values=df_train['Breed2']),
                           dict(label='Gender', values=df_train['Gender']),
                           dict(label='Color1', values=df_train['Color1']),
                           dict(label='Color2', values=df_train['Color2']),
                           dict(label='Color3', values=df_train['Color3']),
                           dict(label='MaturitySize', values=df_train['MaturitySize']),
                           dict(label='FurLength', values=df_train['FurLength']),
                           dict(label='Vaccinated', values=df_train['Vaccinated']),
                           dict(label='Dewormed', values=df_train['Dewormed']),
                           dict(label='Sterilized', values=df_train['Sterilized']),
                           dict(label='Health', values=df_train['Health']),
                           # BUG FIX: axis label was misspelled 'Quanitity'
                           dict(label='Quantity', values=df_train['Quantity']),
                           dict(label='Fee', values=df_train['Fee']),
                           dict(label='State', values=df_train['State']),
                           dict(label='VideoAmt', values=df_train['VideoAmt']),
                           dict(label='PhotoAmt', values=df_train['PhotoAmt']),
                           dict(label='AdoptionSpeed', values=df_train['AdoptionSpeed'])],
               marker=dict(size=5,
                           line=dict(width=0.5,
                                     color='rgb(230,230,230)')),
               diagonal=dict(visible=False),
               showupperhalf=False)
# Shared axis style: no axis line, no zero line, white grid.
axisd = dict(showline=False,
             zeroline=False,
             gridcolor='#fff')
layout = go.Layout(title="Pairplot for adoption speed",
                   dragmode='select',
                   width=2000,
                   height=2000,
                   autosize=False,
                   hovermode='closest',
                   plot_bgcolor='rgba(240,240,240, 0.95)',
                   xaxis1=dict(axisd), xaxis2=dict(axisd), xaxis3=dict(axisd), xaxis4=dict(axisd), xaxis5=dict(axisd), xaxis6=dict(axisd),
                   xaxis7=dict(axisd), xaxis8=dict(axisd), xaxis9=dict(axisd), xaxis10=dict(axisd), xaxis11=dict(axisd), xaxis12=dict(axisd),
                   xaxis13=dict(axisd), xaxis14=dict(axisd), xaxis15=dict(axisd), xaxis16=dict(axisd), xaxis17=dict(axisd), xaxis18=dict(axisd),
                   xaxis19=dict(axisd), xaxis20=dict(axisd),
                   yaxis1=dict(axisd), yaxis2=dict(axisd), yaxis3=dict(axisd), yaxis4=dict(axisd), yaxis5=dict(axisd), yaxis6=dict(axisd),
                   yaxis7=dict(axisd), yaxis8=dict(axisd), yaxis9=dict(axisd), yaxis10=dict(axisd), yaxis11=dict(axisd), yaxis12=dict(axisd),
                   yaxis13=dict(axisd), yaxis14=dict(axisd), yaxis15=dict(axisd), yaxis16=dict(axisd), yaxis17=dict(axisd), yaxis18=dict(axisd),
                   yaxis19=dict(axisd), yaxis20=dict(axisd))
fig = go.Figure(data=[obj], layout=layout)
plotly.offline.iplot(fig, filename="pairplot")
# Drop identifier/free-text columns, then standardize the remaining
# numeric features to zero mean and unit variance.
Data_train = df_train.drop(columns=['Name', 'RescuerID', 'Description', 'PetID', 'State'])
print(Data_train)
Data_train = StandardScaler().fit_transform(Data_train)
print("Data_train scaled =\n", Data_train)
PCA, for short, is a method for reducing the dimensionality of data. It can be thought of as a projection method where data with m columns (features) is projected into a subspace with m or fewer columns, whilst retaining the essence of the original data. It is used to explain the variance-covariance structure of a set of variables through linear combinations. It is often used as a dimensionality-reduction technique.
# Project the scaled features onto their first two principal components.
pcaT = PCA(n_components=2)
principal_components_train = pcaT.fit_transform(Data_train)
print("principal_components (TRAIN) =\n", principal_components_train)
# Visualize the training rows in the 2-D principal-component space.
plt.figure(figsize=(20, 20))
plt.scatter(principal_components_train[:, 0], principal_components_train[:, 1])
plt.show()
Since our data in the column for Adoption Speed are numbers that are in an ascending manner, and according to the explanation for the meaning of each value, it is intuitive that we would like to try regression techniques; but we would also like to make sure that regression is the right choice, and that is why we would apply a log transformation function and plot it. To prevent overfitting, maybe a better choice would be to try some classification methods that are more suitable for this dataset. Because the classes are not binary but multiclass, and most of the features by which we would like to classify are already labeled, we would use some classification algorithms for multiclass-value attributes, because this dataset is suitable for them.
As we can see from the pair plot, there are a few points that don't fit with the crowd. They are especially visible on the Fee vs. AdoptionSpeed cell-plot. It is a little unusual for a pet with such a high fee to have been listed for adoption for a longer time, but we won't treat these values as outliers, so we are not going to remove them. In this step, we will apply a log transformation on AdoptionSpeed.
# Apply log(1+x) to the target in both splits.
train_df["AdoptionSpeed"] = np.log1p(train_df["AdoptionSpeed"])
test_df["AdoptionSpeed"] = np.log1p(test_df["AdoptionSpeed"])
# Re-plot the distribution and a normal probability plot.
fig, ax = plt.subplots(figsize=(8, 6))
plt.title('After log(1+x)')
sns.distplot(train_df["AdoptionSpeed"], fit=stats.norm)
fig, ax = plt.subplots(figsize=(8, 6))
stats.probplot(train_df["AdoptionSpeed"], plot=plt)
plt.show()
After these plots, we decided that classification techniques may provide higher accuracy and prevent overfitting, because we are training on a dataset with 14900 rows.
To start classifying, we firstly need to divide the dataset into separate training and testing subsets, where the target feature, in this case AdoptionSpeed, is set in the y variable, and the other relevant features that will play a deciding role in the classification — the Type, Breed, Age, Colors, Quantity, MaturitySize, the pet being Dewormed, Sterilized and Vaccinated, the length of the Fur, number of photos or videos uploaded, and Health — are in the X.
# Build the feature matrix for classification: every column except the
# identifiers, the free-text fields, and the target itself.
# (The same cell can be adapted for a different target, e.g. Type.)
X = df_train.drop(columns=['Name', 'RescuerID', 'Description', 'PetID', 'State', 'AdoptionSpeed'])
X.tail()
# Random seed for reproducibility
np.random.seed(42)
# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)
X_train.head()
From here we can see that our training data set is randomly chosen.
# Peek at the training labels and how many rows ended up in the split.
y_train, len(y_train)
Now we've got our data prepared, we can start to fit models. We'll be using the following and comparing their results.
1.Logistic Regression - LogisticRegression()
2.K-Nearest Neighbors - KNeighborsClassifier()
3.RandomForest - RandomForestClassifier()
All of the algorithms in the Scikit-Learn library use the same functions, for training a model, model.fit(X_train, y_train) and for scoring a model model.score(X_test, y_test). score() returns the ratio of correct predictions (1.0 = 100% correct).
# Candidate classifiers, keyed by the display name used in the results.
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
}
# Helper that trains every candidate model and records its test score.
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit each model on the training split and score it on the test split.

    models : dict mapping a display name to a Scikit-Learn style estimator
    X_train, y_train : training data and labels
    X_test, y_test : test data and labels

    Returns a dict mapping each model name to its test-set score
    (for classifiers, the fraction of correct predictions).
    """
    np.random.seed(42)  # reproducible results
    scores = {}
    for name, estimator in models.items():
        estimator.fit(X_train, y_train)
        scores[name] = estimator.score(X_test, y_test)
    return scores
import warnings; warnings.simplefilter('ignore')
# Train and score every candidate model on the same split.
model_scores = fit_and_score(models=models,
                             X_train=X_train,
                             X_test=X_test,
                             y_train=y_train,
                             y_test=y_test)
model_scores
# Bar chart comparing the accuracies.
model_compare = pd.DataFrame(model_scores, index=['accuracy'])
model_compare.T.plot.bar();
There's one main hyperparameter we can tune for the K-Nearest Neighbors (KNN) algorithm, and that is the number of neighbors. The default is 5 (n_neighbors=5). A small value of k means that noise will have a higher influence on the result, and a large value makes it computationally expensive, so we will leave the default.
# Sweep n_neighbors from 1 to 20 and track train/test accuracy.
train_scores, test_scores = [], []
neighbors = range(1, 21)  # 1 to 20
knn = KNeighborsClassifier()
for k in neighbors:
    knn.set_params(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))
#KNN train scores
train_scores
# Plot both curves for easier comparison.
plt.plot(neighbors, train_scores, label="Train score")
plt.plot(neighbors, test_scores, label="Test score")
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()
print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")
We've tuned KNN by hand, but let's see how we can tune LogisticRegression and RandomForestClassifier using RandomizedSearchCV.
Instead of us having to manually try different hyperparameters by hand, RandomizedSearchCV tries a number of different combinations, evaluates them and saves the best.
Reading the Scikit-Learn documentation for LogisticRegression, we find there's a number of different hyperparameters we can tune. The same for RandomForestClassifier.
Let's create a hyperparameter grid (a dictionary of different hyperparameters) for each and then test them out. While RandomizedSearchCV samples a fixed number of random combinations, our grid for LogisticRegression is small, so below we use GridSearchCV to try every combination.
The grid in log_reg_grid contains 20 values of C with a single solver, so the search will try 20 different combinations of hyperparameters and save the best one.
# Hyperparameter grid: 20 log-spaced regularisation strengths.
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}
# Exhaustive search over the grid with 5-fold cross-validation.
gs_log_reg = GridSearchCV(LogisticRegression(),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=True)
gs_log_reg.fit(X_train, y_train);
# Best parameters found, then the test-set score of the tuned model.
gs_log_reg.best_params_
gs_log_reg.score(X_test, y_test)
Our grid only has a maximum of 20 different hyperparameter combinations.
Note: If there are a large amount of hyperparameters combinations in your grid, GridSearchCV may take a long time to try them all out. This is why it's a good idea to start with RandomizedSearchCV, try a certain amount of combinations and then use GridSearchCV to refine them.
Now we've got a tuned model, let's get some of the metrics we discussed before.
We want:
ROC curve and AUC score - plot_roc_curve()
Confusion matrix - confusion_matrix()
Classification report - classification_report()
Precision - precision_score()
Recall - recall_score()
F1-score - f1_score()
Luckily, Scikit-Learn has these all built-in.
To access them, we'll have to use our model to make predictions on the test set. You can make predictions by calling predict() on a trained model and passing it the data you'd like to predict on.
We'll make predictions on the test data.
# Predict on the held-out test data with the tuned model.
y_preds = gs_log_reg.predict(X_test)
y_preds
y_test
A confusion matrix is a visual way to show where your model made the right predictions and where it made the wrong predictions (or in other words, got confused).
Scikit-Learn allows us to create a confusion matrix using confusion_matrix() and passing it the true labels and predicted labels.
# Raw confusion matrix of the tuned model's test predictions.
print(confusion_matrix(y_test, y_preds))
# Seaborn for a nicer heatmap version below.
import seaborn as sns
sns.set(font_scale=1.5)  # larger annotation font
def plot_conf_mat(y_test, y_preds):
    """
    Plot a confusion matrix as a Seaborn heatmap.

    y_test : true labels
    y_preds : predicted labels
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(confusion_matrix(y_test, y_preds),
                     annot=True,  # Annotate the boxes
                     cbar=False)
    # BUG FIX: confusion_matrix(y_true, y_pred) puts true labels on the
    # rows and predictions on the columns, so the heatmap's x axis is the
    # predicted label and the y axis the true label — the original code
    # had the two labels swapped.
    plt.xlabel("predicted label")
    plt.ylabel("true label")
plot_conf_mat(y_test, y_preds)
We can make a classification report using classification_report() and passing it the true labels as well as our models predicted labels.
A classification report will also give us information of the precision and recall of our model for each class.
import warnings
warnings.simplefilter('ignore')
# Per-class precision/recall/F1 for the tuned model.
print(classification_report(y_test, y_preds))
# Best hyperparameters found by the grid search
gs_log_reg.best_params_
# Import cross_val_score
from sklearn.model_selection import cross_val_score
# Rebuild the best model found by GridSearchCV above.
clf = LogisticRegression(C=0.23357214690901212,
                         solver="liblinear")
# 5-fold cross-validated accuracy on the full feature matrix
cv_acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
cv_acc
# Collapse the five fold scores into their mean.
cv_acc = np.mean(cv_acc)
cv_acc
# Fit on the training split so the coefficients can be inspected.
clf.fit(X_train, y_train);
clf.coef_
# Pair every column name with its learned coefficient.
features_dict = dict(zip(df_train.columns, list(clf.coef_[0])))
features_dict
# Visualize feature importance
features_df = pd.DataFrame(features_dict, index=[0])
features_df.T.plot.bar(title="Feature Importance", legend=False);
pd.crosstab(df_train["AdoptionSpeed"],df_train["Gender"])
pd.crosstab(df_train["AdoptionSpeed"],df_train["Type"])
pd.crosstab(df_train["AdoptionSpeed"],df_train["Dewormed"])
pd.crosstab(df_train["AdoptionSpeed"],df_train["Vaccinated"])
df_train.groupby(['AdoptionSpeed']).mean()
%matplotlib inline
import matplotlib.pyplot as plt
df_train["AdoptionSpeed"].hist()
#scatter plot
fig,ax=plt.subplots(figsize=(10,10))
ax.scatter(df_train["AdoptionSpeed"],df_train["PhotoAmt"])
Since we saw in our previous steps that RandomForestClassifier gave us the best score, we wanted to use another kernel baseline random forest, to demonstrate a routine, get_class_bounds(), that maps real-valued ordinal classes to integer classes based on the known class-distribution of the target y values.
This kernel uses Kseniia Palin's kernel baseline random forest to demonstrate a routine, get_class_bounds(), that maps real-valued ordinal classes to integer classes based on the known class-distribution of the target y values.
Comments have been added to Palin's code but otherwise the actual data preparation and model fitting are left as-is; the purpose of this kernel is to demonstrate get_class_bounds() rather than optimize the model. Note that Palin's implementation here generates Test predictions for each of the k-folds and averages those predictions; this is different from fitting the whole training set and using that model to generate a single Test prediction.
get_class_bounds() is similar to the OptimizedRounder used in other kernels, but it runs more quickly and it may be less prone to overfitting. It could also be used in OptimizedRounder to set the initial boundary values, e.g., in place of initial_coef = [0.5, 1.5, 2.5, 3.5].
df_train.head()
# Re-read the competition test set for the sentiment-feature kernel below.
test_df = pd.read_csv("data/test.csv")
# Read the Training sentiment score and magnitude for one pet;
# (0, 0) is returned if there is no file found.
# Note that when used, the argument fn is one row of a dataframe,
# in which case fn['PetID'] is the PetID.
def readFile(fn):
    """
    Return a Series ('mag', 'score') with the sentiment magnitude and
    score of the training pet in row fn, or (0, 0) when its JSON file
    does not exist.
    """
    # BUG FIX: json and json_normalize are never imported at the top of
    # the file; import them locally so the function is self-contained.
    import json
    from pandas import json_normalize
    file = '../WBS-Project/data/train/'+fn['PetID']+'.json'
    if os.path.exists(file):
        with open(file) as data_file:
            data = json.load(data_file)
        df = json_normalize(data)
        mag = df['documentSentiment.magnitude'].values[0]
        score = df['documentSentiment.score'].values[0]
        return pd.Series([mag, score], index=['mag', 'score'])
    else:
        return pd.Series([0, 0], index=['mag', 'score'])
def readTestFile(fn):
    """
    Return a Series ('mag', 'score') with the sentiment magnitude and
    score of the test pet in row fn, or (0, 0) when its JSON file does
    not exist.  fn is one row of a dataframe, so fn['PetID'] is the PetID.
    """
    # BUG FIX: json and json_normalize are never imported at the top of
    # the file; import them locally so the function is self-contained.
    import json
    from pandas import json_normalize
    file = '../WBS-Project/data/test/'+fn['PetID']+'.json'
    if os.path.exists(file):
        with open(file) as data_file:
            data = json.load(data_file)
        df = json_normalize(data)
        mag = df['documentSentiment.magnitude'].values[0]
        score = df['documentSentiment.score'].values[0]
        return pd.Series([mag, score], index=['mag', 'score'])
    else:
        return pd.Series([0, 0], index=['mag', 'score'])
# Apply the sentiment readers row by row; apply(..., axis=1) passes one
# row at a time, so the function receives a row and reads row['PetID'].
df_train[['SentMagnitude', 'SentScore']] = df_train[['PetID']].apply(readFile, axis=1)
test_df[['SentMagnitude', 'SentScore']] = test_df[['PetID']].apply(readTestFile, axis=1)
# Training features/target and test features for the kernel below.
train_X = df_train.drop(['Name', 'Description', 'RescuerID', 'PetID', 'AdoptionSpeed'], axis=1)
train_y = df_train['AdoptionSpeed']
test_X = test_df.drop(['Name', 'Description', 'RescuerID', 'PetID'], axis=1)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier
# Arrays that will hold the final predicted train and test values.
train_meta = np.zeros(train_y.shape)
test_meta = np.zeros(test_X.shape[0])
# Choose and initialize a model.
# NOTE: max_features='auto' (an alias of 'sqrt' for classifiers) was
# deprecated and then removed from scikit-learn; 'sqrt' is the identical,
# still-supported spelling.
clf = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=80,
                             max_features='sqrt', min_samples_leaf=5,
                             min_samples_split=5, n_estimators=200)
# Divide the training data into k stratified folds, k=4 here.
splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=1812).split(train_X, train_y))
# Loop over the folds and fit the model to each fold's training data.
# Then evaluate that model on i) the validation data of that fold,
# and ii) on all of the test data.
for idx, (train_idx, valid_idx) in enumerate(splits):
    # The training and validation sets for this fold
    X_train = train_X.iloc[train_idx]
    y_train = train_y[train_idx]
    X_val = train_X.iloc[valid_idx]
    y_val = train_y[valid_idx]
    # Fit the model
    clf.fit(X_train, y_train)
    # Validation kappa and accuracy using the integer classes straight
    # from the classifier (i.e. before any boundary adjustment)
    y_pred = clf.predict(X_val)
    print("Fold {}: accuracy = {:.1f}%, kappa = {:.4f} (no boundary adjustment)".format(idx,
          100.0*accuracy_score(y_val, y_pred),
          cohen_kappa_score(y_val, y_pred, weights='quadratic')))
    #
    # Assign real-valued classes in addition to the integer classes of y_pred.
    # Start with the predicted probabilities by class
    y_probs = clf.predict_proba(X_val)
    # and get the class values (use a copy in case we change values)
    class_vals = clf.classes_.copy()
    # Change the ordinal weight of class 0 to be -1 as suggested by the plot in discussion:
    # https://www.kaggle.com/c/petfinder-adoption-prediction/discussion/76265
    # Does not make much difference, though.
    class_vals[0] = -1
    # Create the float class values as the probability-weighted class.
    # Here a python "list comprehension" is used rather than a loop.
    y_floats = [sum(y_probs[ix]*class_vals) for ix in range(len(y_probs[:,0]))]
    #
    # Save these y_float values instead of the y_pred integers;
    ##train_meta[valid_idx] = y_pred.reshape(-1)
    train_meta[valid_idx] = y_floats
    # the predictions for just this validation fold are saved in the train_meta array;
    # looping over all folds will provide one prediction for each training sample.
    # Now use this fold's same model to generate Test predictions.
    ##y_test = clf.predict(test_X)
    # Instead of integer classes, get the predicted probabilities
    test_probs = clf.predict_proba(test_X)
    # and turn these into float class values.
    # Unlike the validation case, we get a test prediction from every fold,
    # so those float predictions are averaged (list comprehension again).
    ##test_meta += y_test.reshape(-1) / len(splits)
    test_meta += np.array([sum(test_probs[ix]*class_vals) for
                           ix in range(len(test_probs[:,0]))]) / len(splits)
# The next two routines map float regression values to ordinal classes by
# using the known distribution of the training classes.
# Below, y_pred is a floating value — e.g. the output of a regression to
# the class, or a probability-weighted class built from a classifier's
# predict_proba():
#   y_probs = clf.predict_proba(X_val)
#   class_vals = clf.classes_.copy()   # copy in case the values are modified
#   y_floats = [sum(y_probs[ix]*class_vals) for ix in range(len(y_probs[:,0]))]
def get_class_bounds(y, y_pred, N=5, class0_fraction=-1):
    """
    Find boundary values for y_pred to match the known y class percentiles.
    Returns N-1 boundaries in y_pred values that separate y_pred
    into N classes (0, 1, 2, ..., N-1) with same percentiles as y has.
    Can adjust the fraction in Class 0 by the given factor (>=0), if desired.
    """
    sorted_y = np.sort(y)
    sorted_pred = np.sort(y_pred)
    cuts = []
    for class_idx in range(N - 1):
        # number of true labels at or below this class value
        count = len(sorted_y[sorted_y <= class_idx])
        if (class_idx == 0) and (class0_fraction >= 0.0):
            # optionally shrink/grow the number of class-0 predictions
            count = int(class0_fraction * count)
        cuts.append(sorted_pred[count])
    return cuts
def assign_class(y_pred, boundaries):
    """
    Given class boundaries in y_pred units, output integer class values.
    """
    classes = np.zeros(len(y_pred))
    # Each boundary promotes every prediction at or above it by one class.
    for class_idx, cut in enumerate(boundaries):
        classes[y_pred >= cut] = class_idx + 1
    return classes.astype(int)
# Histogram of the predicted float class values on the training set.
plt.hist(train_meta, bins=50)
plt.title("Training: meta float values")
plt.xlabel("Training y float values")
plt.show()
# This cell calculates and plots the kappa (and MSE) vs the class0 fraction adjustment.
# Note that MSE prefers (lower MSE) a class0 fraction near/at 0,
# whereas kappa prefers (higher kappa) a fraction near 1.
# Then the class0 fraction that gives best training kappa is selected.
# Save values of kappa, MSE, and accuracy vs the class0 fraction
kappas = []
mses = []
accurs = []
# fractions to try... (could go larger than 1 if desired.)
cl0fracs = np.array(np.arange(0.01, 1.001, 0.01))
for cl0frac in cl0fracs:
    boundaries = get_class_bounds(train_y, train_meta, class0_fraction=cl0frac)
    train_meta_ints = assign_class(train_meta, boundaries)
    # CONSISTENCY: use train_y (== df_train['AdoptionSpeed']) as the truth
    # throughout, matching the final-evaluation cell below.
    kappa = cohen_kappa_score(train_y, train_meta_ints, weights='quadratic')
    kappas.append(kappa)
    mse = mean_squared_error(train_y, train_meta_ints)
    mses.append(mse)
    accur = accuracy_score(train_y, train_meta_ints)
    accurs.append(accur)
# Use the class0 fraction that gives the highest training kappa
ifmax = np.array(kappas).argmax()
cl0frac = cl0fracs[ifmax]
print("Best kappa for class0 fraction = {:.4f}".format(cl0frac))
# Kappa vs class0 fraction, with the best point highlighted in green.
plt.plot(cl0fracs, kappas)
plt.plot([cl0frac], [kappas[ifmax]], marker="o", color="green")
plt.title("Training: kappa vs class0_fraction")
plt.xlabel("class0_fraction")
plt.ylabel("kappa")
plt.show()
# MSE and Accuracy vs class0 fraction, same layout.
for vals, metric in ((mses, "MSE"), (accurs, "Accuracy")):
    plt.plot(cl0fracs, vals)
    plt.title("Training: {} vs class0_fraction".format(metric))
    plt.xlabel("class0_fraction")
    plt.ylabel(metric)
    plt.show()
# The class0_fraction sweep and plotting cells above can be skipped:
# delete those two cells and just uncomment this line:
##cl0frac = 1.0
print("Using class0_fraction = {:.4f}, gives boundaries:".format(cl0frac))
# Recompute the boundaries with the chosen class-0 fraction.
boundaries = get_class_bounds(train_y, train_meta, class0_fraction=cl0frac)
print(boundaries)
train_meta_ints = assign_class(train_meta, boundaries)
# Quadratic-weighted kappa of the boundary-adjusted training predictions.
kappa = cohen_kappa_score(train_y, train_meta_ints, weights='quadratic')
print("Adjusted boundaries give:")
print("kappa = {:.4f} (with accuracy = {:.1f}%)".format(kappa,
      100.0*accuracy_score(train_y, train_meta_ints)))
# Confusion matrix of the boundary-adjusted training predictions.
con_mat = confusion_matrix(train_y, train_meta_ints)
# The diagonal holds the exactly-agreeing predictions; its sum divided by
# the total reproduces the accuracy.  np.trace sums the diagonal directly,
# avoids shadowing the builtin `id` the original loop used as an index,
# and does not assume the matrix is exactly 5x5.
diag = float(np.trace(con_mat))
print("\nConfusion matrix - Columns are predicted 0, predicted 1, etc.\n")
print(con_mat)
print("")
print("\n{2:.2f}% = {0}/{1} are on the diagonal (= accuracy)".format(
    int(diag), con_mat.sum(), 100.0*diag/con_mat.sum()))
# Overlay the integer predictions on the true label distribution.
plt.hist(train_meta_ints, bins=40, color='blue')
plt.hist(train_y, bins=20, bottom=0.0, alpha=0.2)
plt.title("Train: Boundary-based Predictions")
plt.show()
# Distribution of the averaged float predictions on the test set.
plt.hist(test_meta, bins=50)
plt.title("Test: meta float values")
plt.show()
# Convert the float test predictions to classes using the TRAINING boundaries.
test_meta_ints = assign_class(test_meta, boundaries)
plt.hist(test_meta_ints.astype(int), bins=50)
plt.title("Test: Boundary-based Predictions")
plt.show()
# Write the submission file and read it back as a sanity check.
sub = pd.read_csv('../WBS-Project/data/sample_submission.csv')
sub['AdoptionSpeed'] = test_meta_ints
sub['AdoptionSpeed'] = sub['AdoptionSpeed'].astype(int)
sub.to_csv("submission.csv", index=False)
submission = pd.read_csv('submission.csv')
submission.head()
Because of the low accuracy scores, we will now try to adjust our dataset by making our target feature binary. For the values 0, 1, 2 we will put 0, and for the values 3 and 4 we will put 1, and train on the data so we can compare the scores. In the next steps we would use SMOTE to balance the samples of 0 and 1, so that we have more trust in the further testing of the scores from the training models.
# NOTE(review): `tmp` is an alias of df_train, not a copy — the replace below
# mutates df_train in place.  Use df_train.copy() if the original 0-4 labels
# are needed later; kept as an alias here to preserve existing behaviour.
tmp = df_train
tmp
# Binarise the target: adoption speeds {0,1,2} -> 0 and {3,4} -> 1.
# A single dict-based replace is equivalent to the original four chained
# replace([k], v) calls, in one pass over the column.
tmp['AdoptionSpeed'] = tmp['AdoptionSpeed'].replace({1: 0, 2: 0, 3: 1, 4: 1})
tmp
# Everything except the AdoptionSpeed variable and the irrelevant attributes.
X1 = tmp.drop(columns=['Name', 'RescuerID', 'Description', 'PetID', 'State',
                       'AdoptionSpeed', 'SentMagnitude', 'SentScore'])
y1 = tmp["AdoptionSpeed"]
X1.tail()
# Target variable (now binary after the mapping above).
y1 = tmp["AdoptionSpeed"]
y1
# Random seed for reproducibility
np.random.seed(42)
# Split into train & test set (20% of the data held out for testing).
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2)
# Inspect the class balance of the binarised target.
tmp.groupby('AdoptionSpeed').count()
In cases where the target feature has an imbalanced distribution of values, this method - SMOTE - is used to give better balance to the dataset, so there is a better chance that the accuracy estimate will be more precise. Since we checked that our class does not have an imbalance, we are not going to use SMOTE.
# SMOTE oversampling kept for reference — not used because the binarised
# classes turned out to be roughly balanced.
#from collections import Counter
#from imblearn.over_sampling import SMOTE
#print (Counter(y1))
#sm = SMOTE(random_state=42)
#X_res, y_res = sm.fit_resample(X1, y1)
#print('Resampled dataset shape %s' % Counter(y_res))
#print (X_res.shape)
#print (y_res.shape)
#X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=1,stratify=y_res)
X1
data_final_vars = X1
# BUG FIX: the original lines
#     y1=['y']
#     X1=[i for i in data_final_vars if i not in y]
# raised a NameError (`y` was never defined) and clobbered X1/y1 with plain
# lists.  Only the candidate feature names are needed here, so derive them
# from the dataframe columns without touching X1/y1.
feature_names = [col for col in data_final_vars if col != 'AdoptionSpeed']
from sklearn.feature_selection import RFE
lr = LogisticRegression()
# Keyword argument works across old and new scikit-learn versions, where the
# second positional parameter of RFE was deprecated.
rfe = RFE(lr, n_features_to_select=20)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)
print(rfe.ranking_)
Recursive Feature Elimination (RFE) is based on the idea of repeatedly constructing a model and choosing either the best or the worst performing feature, setting that feature aside, and then repeating the process with the remaining features. This process is applied until all features in the dataset are exhausted. The goal of RFE is to select features by recursively considering smaller and smaller sets of features.
RFE has helped us select the following features: 'Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt'.
# Features retained by the RFE step above.
cols = ['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
        'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
        'Sterilized', 'Health', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt']
X1 = X_train[cols]
y1 = y_train
import statsmodels.api as sm
# Fit a logit model so we can inspect per-feature p-values in the summary.
logit_model = sm.Logit(y1, X1)
result = logit_model.fit()
print(result.summary2())
The p-values for most of the variables are smaller than 0.05, except for 7 of them; therefore, we will remove those 7 variables.
# Refit the logit using only the features whose p-values were below 0.05.
cols = ['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'FurLength',
        'Vaccinated', 'Dewormed', 'Sterilized', 'Quantity']
X1 = X_train[cols]
y1 = y_train
logit_model = sm.Logit(y1, X1)
result = logit_model.fit()
print(result.summary2())
from sklearn import metrics
# Fresh 80/20 split of the reduced feature set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2,
                                                    random_state=0)
# FIX: the original fitted two identical LogisticRegression models
# back-to-back; a single fit is sufficient.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# predicting the test_set results and calculating accuracy
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
# Keep the second name bound so any later cell using it still works.
logistic_regression = logreg
cr_logistic_regression = classification_report(y_test, y_pred)
cm_logistic_regression = confusion_matrix(y_test, y_pred)
print(cr_logistic_regression)
print(cm_logistic_regression)
# Gaussian Naive Bayes baseline on the same split.
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
cr_gnb = classification_report(y_test, y_pred)
cm_gnb = confusion_matrix(y_test, y_pred)
print(cr_gnb)
print(cm_gnb)
from sklearn.tree import DecisionTreeClassifier
# Unrestricted CART tree with the default Gini impurity criterion.
clf_gini = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = clf_gini.predict(X_test)
cr_decision_tree_gini = classification_report(y_test, y_pred)
cm_decision_tree_gini = confusion_matrix(y_test, y_pred)
print(cr_decision_tree_gini)
print(cm_decision_tree_gini)
print("Gini decision tree depth: ", clf_gini.get_depth())
# Same tree, but splitting on information gain (entropy) instead of Gini.
clf_entropy = DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train)
y_pred = clf_entropy.predict(X_test)
cr_decision_tree_entropy = classification_report(y_test, y_pred)
cm_decision_tree_entropy = confusion_matrix(y_test, y_pred)
print(cr_decision_tree_entropy)
print(cm_decision_tree_entropy)
print("Entropy decision tree depth: ", clf_entropy.get_depth())
# Sweep max_depth = 1..29 for both split criteria, recording accuracy and F1
# (as percentages) at each depth.  Gini is fitted before entropy at every
# depth, matching the original fit order.
max_depth = list(range(1, 30))
acc_gini, acc_entropy = [], []
f1_gini, f1_entropy = [], []
for depth in max_depth:
    for criterion, acc_list, f1_list in (('gini', acc_gini, f1_gini),
                                         ('entropy', acc_entropy, f1_entropy)):
        dtree = DecisionTreeClassifier(criterion=criterion, max_depth=depth)
        pred = dtree.fit(X_train, y_train).predict(X_test)
        acc_list.append(accuracy_score(y_test, pred) * 100)
        f1_list.append(f1_score(y_test, pred) * 100)
# Collect the sweep results in one frame for plotting.
d = pd.DataFrame({'acc_gini': acc_gini,
                  'acc_entropy': acc_entropy,
                  'f1_gini': f1_gini,
                  'f1_entropy': f1_entropy,
                  'max_depth': max_depth})
# visualizing changes in parameters
for column, lbl in (('acc_gini', 'Accuracy Gini'),
                    ('acc_entropy', 'Accuracy Entropy'),
                    ('f1_gini', 'F1 Gini'),
                    ('f1_entropy', 'F1 Entropy')):
    plt.plot('max_depth', column, data=d, label=lbl)
plt.xlabel('max_depth')
plt.ylabel('accuracy/gini (%)')
plt.legend();
# Report the best score and the depth (1-based) at which it occurred.
print("Maximum Gini tree accuracy score on the test data: ", max(acc_gini), " at max depth: ", np.argmax(acc_gini) + 1)
print("Maximum Gini f1 score on the test data: ", max(f1_gini), " at max depth: ", np.argmax(f1_gini) + 1)
print("Maximum Entropy tree accuracy score on the test data: ", max(acc_entropy), " at max depth: ", np.argmax(acc_entropy) + 1)
print("Maximum Entropy f1 score on the test data: ", max(f1_entropy), " at max depth: ", np.argmax(f1_entropy) + 1)
# we find out that the best Tree Classifier is the one witch uses Gini index and has maximum depth 7
clf = DecisionTreeClassifier(max_depth=7).fit(X_train, y_train)
y_pred = clf.predict(X_test)
cr_dt = classification_report(y_test, y_pred)
cm_dt = confusion_matrix(y_test, y_pred)
print(cr_dt)
print(cm_dt)
# Sweep n_neighbors = 2..19 for KNN, recording accuracy and F1 (percent).
accuracy_scores = []
f1_scores = []
neighbors = range(2, 20)
knn = KNeighborsClassifier()
for k in neighbors:
    knn.set_params(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred) * 100)
    f1_scores.append(f1_score(y_test, y_pred) * 100)
plt.plot(neighbors, accuracy_scores, label="Accuracy score")
plt.plot(neighbors, f1_scores, label="F1 score")
# FIX: ticks now match the swept range 2..19 (was 1..20).
plt.xticks(np.arange(2, 20, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()
print(f"Maximum KNN accuracy score on the test data: {max(accuracy_scores):.2f}%")
print(f"Maximum KNN f1 score on the test data: {max(f1_scores):.2f}%")
# FIX: derive the best neighbor count from the sweep instead of hardcoding 2
# (the original comment claimed n=2 was best but never verified it).
best_n = neighbors[int(np.argmax(accuracy_scores))]
print("Best n_neighbors by accuracy: ", best_n)
knn = KNeighborsClassifier(n_neighbors=best_n)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
cr_knn = classification_report(y_test, y_pred)
cm_knn = confusion_matrix(y_test, y_pred)
print(cr_knn)
print(cm_knn)
# Random forest with capped depth; fixed seed keeps the run reproducible.
random_forest = RandomForestClassifier(max_depth=10, random_state=0)
y_pred = random_forest.fit(X_train, y_train).predict(X_test)
cr_rf = classification_report(y_test, y_pred)
cm_rf = confusion_matrix(y_test, y_pred)
print(cr_rf)
print(cm_rf)
After the machine learning methods were tested, here is a test with a deep learning technique.
# Random seed for reproducibility
np.random.seed(42)
# Split into train & test set
# NOTE(review): X and y here are the original (non-binarised) regression
# features/target defined earlier in the notebook — confirm against the
# earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X,  # independent variables
                                                    y,  # dependent variable
                                                    test_size=0.2)  # fraction held out for testing
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adam, SGD, Nadam
from keras.regularizers import l1, l1_l2, l2
# FIX: import BatchNormalization from keras.layers — the old
# keras.layers.normalization module path was removed in newer Keras releases,
# while this import works on both old and new versions.
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping
from keras import backend as K


def rmse_dl(y_true, y_pred):
    """Root-mean-squared-error metric operating on Keras backend tensors."""
    return K.sqrt(K.mean(K.square(y_pred - y_true)))
# Small feed-forward regressor: 6 -> 5 -> 1 linear units, with batch
# normalisation and dropout for regularisation.
regressor = Sequential()
regressor.add(Dense(units=6, input_dim=X_train.shape[1], activation='linear',
                    kernel_initializer='random_uniform'))
regressor.add(BatchNormalization())
regressor.add(Dropout(0.2))
# if we want to reduce rmse/increase training error, uncomment this
#regressor.add(Dense(units=10, activation='relu', kernel_initializer='random_uniform',activity_regularizer=l1_l2(0.5, 0.5)))
#regressor.add(BatchNormalization())
regressor.add(Dense(units=5, activation='linear',
                    kernel_initializer='random_uniform',
                    activity_regularizer=l1(6.8)))
regressor.add(BatchNormalization())
regressor.add(Dense(units=1))
# Phase 1: long training run with Nadam at a small learning rate.
nadam_opt = Nadam(lr=0.0005, schedule_decay=0.004)
regressor.compile(optimizer=nadam_opt, loss='mse', metrics=[rmse_dl])
#EarlyStopping(monitor='val_loss',min_delta=1e-5, patience=57)
regressor.fit(X_train, y_train, batch_size=32, epochs=550, shuffle=True,
              validation_split=0.1, verbose=1)
# Phase 2: short fine-tuning pass with tiny-step Nesterov SGD.
sgd_opt = SGD(lr=0.0000002, decay=0.000001, momentum=0.7, nesterov=True)
regressor.compile(optimizer=sgd_opt, loss='mse', metrics=[rmse_dl])
regressor.fit(X_train, y_train, batch_size=32, epochs=25, shuffle=True,
              validation_split=0.1, verbose=1)
# Evaluate: RMSE in log space, then in the original scale via expm1
# (the target was presumably log1p-transformed earlier — verify upstream).
predicted = regressor.predict(X_test)
print("RMSE Neural Network log final: %.3f"
      % np.sqrt(mean_squared_error(y_test, predicted)))
print("RMSE Neural Network final: %.3f"
      % np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(predicted))))